########### Algorithmic representation: Study 2 VPN audit ######
# Code for the replication of the analyses reported in the paper:
# Finding the white male: The prevalence and consequences of 
#algorithmic gender and race bias in political Google searches

# doi: XXXXXXXXXXXXXX

# Contact corresponding author for help/information with code problems


#  OVERVIEW OF CODE SECTIONS----------------------------------------------------

# 0: Data preparation
# H1: Tests of baseline bias hypothesis
# H2: Tests of distribution bias hypothesis


#if using r studio, display content outline with Ctrl + Shift + O


# 0 - Data preparation ----------------------
#libraries
library(tidyverse) # for tidy data manipulations
library(lme4) # For multilevel models
library(lmerTest) # for multilevel models
library(marginaleffects) # extracting results from multilevel models
library(sjPlot) # exporting regression results
library(sjmisc) #exporting regresion resuls
library(psych) # for corr.test()
library(officer) # for printing out tables
library(ggplot2) 
library(ggrepel)
library(ggdist)
library(viridis)
library(lemon)


#enter local path to the folder containing the replication files
path_to_folder <- "" 
setwd(path_to_folder) #set working directory to that folder

data <- read.csv("data/vpn_data_study2.csv", sep = ",") #data from first vpn audit
country_data <- read.csv("data/country_data_vpn.csv", sep = ";") #external data about countries and their legislative bodies (from interpariamentary-union)


# the following lines perform basic transformations to prepare the data
data <- data %>%
  mutate(search_term = query) %>%
  separate(.,query,
           into = c("country", "chamber"), sep = "_") %>% #split query info into variables
  mutate(agent_id = file, #rename variable for clarity
         number_male = counter_male, #rename varibale for clarity in line with previous study
         number_female = counter_female,
         unique_id = uniq_id) %>%
  dplyr::select(-uniq_id,-counter_male, -counter_female) %>% #remove unnecessary variables
  drop_na(number_of_persons) %>% #remove missings for this variable as these cases involve some technical error in data collection
  mutate(
    #recode the region in which the virtual agent was deployed
    region = case_when(
     region == "au" ~ "Australia",
     region == "br" ~ "Brazil",
     region == "ca" ~ "Canada",
     region == "de" ~ "Germany",
     region == "in" ~ "India",
     region == "is" ~ "Israel",
     region == "ja" ~ "Japan",
     region == "sa" ~ "Saudi Arabia",
     region == "za" ~ "South Africa",
     region == "ch" ~ "Switzerland",
     region == "us" ~ "United States"),
    gender_ratio = (number_female - number_male) / (number_female + number_male),
  #calculate the same measures of algorithmic representation as in study 1
  presence_female = case_when(
    number_female >= 1 ~ 1,
    number_female < 1 ~ 0),
  presence_male = case_when(
    number_male >= 1 ~ 1,
    number_male < 1 ~ 0),
  share_female = number_female / number_of_persons,
  #clean up some of the characters
  country = case_when(
    country == "southafrica"~ "South Africa",
    country == "saudiarabia" ~ "Saudi Arabia",
    country == "usa" ~ "United States",
    TRUE ~ str_to_title(country)),
  #unique id for the query
    query_id = paste0(country,"_",chamber))

#get all the case names (this will be used later)
case_names <- unique(data$query_id)

#prepare external country data (same as study 1)
country_info <- country_data %>%
  filter(main_language == "majority language") %>%
  pivot_longer(
    cols = c(lower_share, upper_share),
    names_to = c("chamber","share"),
    names_sep = "_",
    values_to = "representation")%>%
  mutate(
    query_id = paste0(country,"_",chamber),
    world_region = region)%>%
  dplyr::select(representation, query_id, lower_seats,	lower_women,
         gggi_score, gggi_rank, world_region, quota,
         upper_seats,	upper_women,	global_rank) %>%
  filter(query_id %in% case_names)

#merge data sets
data <-data %>%
  left_join(., country_info, by = "query_id",
            multiple = "all")

#finally, we create per chamber data subsets to run separate analyses
d_lower <- data %>%
  filter(chamber == "lower") 

d_upper <- data %>%
  filter(chamber == "upper") 


# H1 - Baseline bias ----------------------
#below we calculatate the mixed effects models with random
# intercepts at the country, region (where agent is situated), and agent levels. 
# As in study 1, we estimate women's representation in search engine output
# as the fixed effect intercept for different measures

## number female -------------
number_female_lower <- lmer(data = d_lower,
                  number_female ~ 1 +
                 (1 | region) + (1 | country) + (1 | agent_id) )

number_female_upper <- lmer(data = d_upper,
                            number_female ~ 1 +
                              (1 | region) + (1 | country) + (1 | agent_id) )


## number male -------------
number_male_lower <- lmer(data = d_lower,
                            number_male ~ 1 +
                          (1 | region) + (1 | country) + (1 | agent_id) )
number_male_upper <- lmer(data = d_upper,
                            number_male ~ 1  +
                          (1 | region) + (1 | country) + (1 | agent_id) )


## presence female -------------
presence_female_lower <- glmer(data = d_lower,
                         presence_female ~ 1 +
                           (1 | region) + (1 | country) + (1 | agent_id),
                         family = "binomial")

presence_female_upper <- glmer(data = d_upper,
                               presence_female ~ 1 +
                                 (1 | region) + (1 | country) + (1 | agent_id),
                               family = "binomial")

## presence male -------------
presence_male_lower <- glmer(data = d_lower,
                               presence_male ~ 1 +
                               (1 | region) + (1 | country) + (1 | agent_id),
                             family = "binomial")

presence_male_upper <- glmer(data = d_upper,
                               presence_male ~ 1 +
                               (1 | region) + (1 | country) + (1 | agent_id),
                             family = "binomial")

## share female -------------

share_female_lower <- lmer(data = d_lower,
                      share_female ~ 1 +
                      (1 | region) + (1 | country) + (1 | agent_id) )
share_female_upper <- lmer(data = d_upper,
                           share_female ~ 1 +
                           (1 | region) + (1 | country) + (1 | agent_id) )


# The functions below create output tables that are in the appendix;
# They also show the results for Table 1 (study 2) in the main manuscript

#The code below produces an output of the upper part of Table C1.1b in the appendix;
tab_model(share_female_lower, presence_female_lower,
          presence_male_lower, number_female_lower, number_male_lower,
          file = "outputs/Study2_TableC1.1b_LowerChamber.doc")

#The code below produces an output of the lower part of Table C1.1b in the appendix;
tab_model(share_female_upper, presence_female_upper,
          presence_male_upper, number_female_upper, number_male_upper,
          file = "outputs/Study2_TableC1.1b_UpperChamber.doc")


# H2: Distribution bias ---------------------------------------
# Below we test the distribution bias hypothesis in two ways
# First run correlations on country-level data sets
# Second, we repeat the same models as above but with representation
# as a predictor (along with control variables)

## country-level correlations -------------------- 

# Note that we do not split the data per chamber as the
# case numbers are too small
data %>%
  # aggregate data to country-level
  group_by(country, chamber) %>%
  summarise(
    representation = mean(representation, na.rm = TRUE),
    share_women = mean(share_female, na.rm = TRUE))%>%
  ungroup() %>%
  # run correlation test data to country-level
  select(share_women, representation) %>%
  psych::corr.test() %>%
  print(short = FALSE)




## larger mixed effect model ---------------------

#this model is reported in the supplemental materials as additional analysis

data <- data %>%
  #add an additional variable distinguishing between "domestic" vs. foreign searches
  mutate(
    location = case_when(
      country == region ~ "home searches",
      country != region ~ "foreign searches"))

share_combined_additional <- lmer(data = data,
                           share_female*100 ~ 
                             representation + chamber +
                             gggi_score + quota + location +
                             (1 | region) + (1 | country) + (1 | agent_id) )

number_combined_additional <- lmer(data = data,
                                  number_female ~ 
                                    representation + chamber +
                                    gggi_score + quota + location +
                                    (1 | region) + (1 | country) + (1 | agent_id) )

presence_combined_additional <- glmer(data = data,
                                  presence_female ~ 
                                    representation + chamber +
                                    gggi_score + quota + location +
                                    (1 | region) + (1 | country) + (1 | agent_id),
                                  family = "binomial")

#create output for Table C2.2 in the appendix
tab_model(share_combined_additional, number_combined_additional, presence_combined_additional,
          file = "outputs/Study2_TableC2.2_AdditionalModelsWithControls.doc")


# additional analyses ------------------------
## across VPN locations  ---------------------

#reproduce Figure C2.1 in the Appendix

#aggregate data for visualization
data_aggregated <-data %>%
  group_by(country, chamber,region, agent_id) %>%
  summarise(
    representation = mean(representation, na.rm = TRUE),
    number_persons = mean(number_of_persons, na.rm = TRUE),
    number_women = mean(number_female, na.rm = TRUE),
    number_men = mean(number_male, na.rm = TRUE),
    presence_women = mean(presence_female, na.rm = TRUE),
    presence_men = mean(presence_male, na.rm = TRUE),
    gender_ratio = mean(gender_ratio, na.rm = TRUE),
    share_women = mean(share_female, na.rm = TRUE)) %>%
  drop_na(share_women)

# get actual descriptive represenation (red ticks)
reps <- data_aggregated %>%
  group_by(country, chamber) %>%
  summarize(mean = mean(representation))


# plot!
country_plot_s2 <- data_aggregated%>%
  mutate(
    location = case_when(
      country == region ~ "home searches",
      country != region ~ "foreign searches")) %>%
  # filter(country != "Saudi Arabia") %>%
  ggplot(.,
         aes(y=country, x = share_women, group = chamber,
             fill = chamber)) +
  geom_vline(xintercept = .5, linetype = "dashed") +
  geom_point(data = reps,
             aes(x = mean/100, y = country, group = chamber),
             alpha = .8, col = "red", size = 4, shape = "|" ,
             position = position_dodge(.7))+
  stat_halfeye(position = position_dodge(.7), point_interval = "mean_qi",
               alpha = .7 )+
  theme_classic()+
  coord_flex_cart(bottom= capped_horisontal(), left = capped_vertical()) +
  labs(y = "", fill = "Chamber",
       x = "Share of women in Google image searches")+
  scale_x_continuous(labels = scales::percent_format(),
                     breaks = seq(0,1,.1), 
  ) +
  scale_fill_manual(values = c("#35b779","#3b528b"))+ #"#3b528b" ##440154"
  theme(legend.position = c(.85,.1),
        strip.background = element_blank())


ggsave(plot = country_plot_s2, filename = "outputs/study2_FigureC.2.1_RegionalVariation.jpeg", dpi = 800,
       width = 7, height = 7)



# Calculate means across all regions
mean_across_all <- data %>%
  group_by(chamber)%>%
  summarise(
    share_women = mean(share_female, na.rm = TRUE)
  ) %>%
  mutate(
    region = "Across locations"
  )

# Summarize data across chambers and regions
data_location <- data %>%
  group_by(chamber, region) %>%
  summarise(
    share_women = mean(share_female, na.rm = TRUE)
  ) %>%
  drop_na(share_women)

# Create a summary table with regions and their corresponding share of women
summary_table <- data_location %>%
  bind_rows(., mean_across_all) %>%
  select(region, chamber, share_women) %>%
  distinct() %>%
  pivot_wider(names_from = chamber, values_from = share_women, 
              values_fill = NA, names_prefix = "share_women_")


# Create Word document with the summary table
doc <- read_docx() %>%
  body_add_par("Table C2.1 Geographical variation in algorithmic representation based on virtual location of data collection", style = "heading 1") %>%
  body_add_table(value = summary_table, style = NULL)

# Save the Word document; this creates output equivalent to Table C2.1 in the Appendix.
print(doc, target = "outputs/study2_TableC2.1_RegionalVariation.docx")


## across agent locations  ---------------------
# reproduce Figure C2.2
agent_location_data <- data%>%
  group_by(region, agent_id, country, chamber) %>%
  filter(country %in% c("United States","Switzerland","Japan",
                        "Saudi Arabia",
                        "Brazil", "India")) %>%
  summarize(
    share_women = mean(share_female, na.rm = TRUE),
    representation = mean(representation, na.rm =TRUE))%>%
  mutate(
    id = paste0(agent_id,region),
    chamber = case_when(
      chamber == "lower" ~ "Lower chamber",
      chamber == "upper" ~ "Upper chamber"
    )) 


data_vline <- agent_location_data%>%
  ungroup() %>%
  group_by(chamber, country) %>%
  summarize(
    vline = mean(representation, na.rm = TRUE))


agent_variation_figure <-
  agent_location_data %>%
  ggplot(.,
         aes(y=reorder(id, share_women), x = share_women, color = region)) +
  geom_vline(data = data_vline,
             aes(xintercept = vline/100), col = "red", alpha = .6)+
  geom_vline(xintercept = .5, linetype = "dashed") +
  geom_point(alpha = .56, size = 1)+
  theme_classic()+
  facet_grid(country~chamber, switch = "y")+
  coord_flex_cart(bottom= capped_horisontal(), left = capped_vertical()) +
  labs(y = "Virtual agent searches", color = "VPN location",
       x = "Share of women in Google image searches")+
  scale_x_continuous(labels = scales::percent_format(),
                     breaks = seq(0,1,.1), 
                     limits = c(0,.6)
  ) +
  scale_color_viridis_d()+ #"#3b528b" ##440154"
  theme(
    axis.text.y=element_blank(),
    axis.ticks.y=element_blank(),
    strip.text.y = element_text(size = 11, face = "bold"),
    strip.text.x = element_text(size = 11, face = "bold"),
    strip.background = element_blank()) +
  guides(color = guide_legend(override.aes = list(size = 4))) 


ggsave(agent_variation_figure, dpi = 800, width = 7.5, height = 6.5,
       filename = "outputs/study2_FigureC2.2_AgentVariation.jpeg")





## Correlation plots study 2  ---------------------
#reproduce figure C2.3
data_aggregated_query <-data %>%
  group_by(country, chamber) %>%
  summarise(
    representation = mean(representation, na.rm = TRUE),
    share_women = mean(share_female, na.rm = TRUE)) %>%
  drop_na(share_women)


correlation_plot_s2 <-
  data_aggregated_query %>%
  ungroup() %>%
  mutate(
    labels = paste0(country, "-", chamber)
  ) %>%
  ggplot(aes(y = share_women, label = country, color = chamber, x = representation / 100)) +
  geom_abline(slope = 1, intercept = 0, color = "red", alpha = .4, linetype = "dashed") +
  geom_point(alpha = .85) +
  geom_text_repel(size = 2.5, segment.color = 'grey50') +
  geom_smooth(data = data_aggregated_query,
              aes(y = share_women, x = representation / 100),
              method = "lm",
              color = "grey50", fill = "grey80") +
  theme_classic() +
  coord_flex_cart(bottom = capped_horisontal(), left = capped_vertical()) +
  labs(x = "Share of women in parliament", 
       y = "Share of women in Google image searches") +
  scale_color_manual(values = c("#35b779", "#3b528b")) +
  scale_y_continuous(labels = scales::percent_format(), 
                     breaks = seq(0, 1, .1), limits = c(0, .6)) +
  scale_x_continuous(labels = scales::percent_format(), 
                     breaks = seq(0, 1, .1), limits = c(0, .6)) +
  theme(
    strip.text.x = element_text(size = 12, face = "bold"),
    strip.background = element_blank()
  )

ggsave(correlation_plot_s2, filename = "outputs/study2_FigureC2.3_CorrelationPlot.jpeg",
       width = 5, height = 5, dpi = 800)



